This script demonstrates how samples in the PBTA cluster by composition and cancer type, using the dimensionality reduction techniques PCA, t-SNE, and UMAP.

Set Up

# Bind the pipe operator exported by dplyr to `%>%` in this script's
# environment, so pipelines below can use it while every other dplyr function
# is still called with an explicit `dplyr::` prefix
`%>%` <- dplyr::`%>%`

# Attach sample metadata to a data.frame of dimension reduction scores.
#
# Args:
#   name: data.frame of scores with one row per sample.
#   id: vector of biospecimen IDs, one per row of `name`, in the same order.
#   histologies: metadata table with the columns `Kids_First_Biospecimen_ID`,
#     `disease_type_new` and `composition`. Defaults to the script-level `df2`,
#     so existing two-argument calls keep working.
#
# Returns:
#   `name` restricted to the samples that have metadata, with three added
#   columns: `ID`, `type` (factor of `disease_type_new`) and `composition`
#   (factor of `composition`).
reduction_fn <- function(name, id, histologies = df2) {
  name$ID <- id
  # Keep a single metadata row (the first one seen) per biospecimen ID
  first_seen <- !duplicated(histologies$Kids_First_Biospecimen_ID)
  metadata <- histologies[first_seen, , drop = FALSE]
  # Drop samples that have no metadata at all
  has_metadata <- name$ID %in% metadata$Kids_First_Biospecimen_ID
  name <- name[has_metadata, , drop = FALSE]
  # Look up each remaining sample's metadata row by ID. The metadata table is
  # not guaranteed to list samples in the same order as the scores, so the
  # labels must be matched by ID rather than copied over by position.
  metadata_row <- match(name$ID, metadata$Kids_First_Biospecimen_ID)
  name$type <- as.factor(metadata$disease_type_new[metadata_row])
  name$composition <- as.factor(metadata$composition[metadata_row])
  name
}

# Make sure the "plots" output directory exists before any figure is saved.
if (!dir.exists("plots")) dir.create("plots")

# Load the histologies table (used below as the sample metadata) and convert
# it to a base data.frame
df2 <- file.path("..", "..", "data", "pbta-histologies.tsv") %>%
  readr::read_tsv() %>%
  data.frame()
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   age_at_diagnosis = col_double()
## )
## See spec(...) for full column specifications.
# Load the kallisto expression table and convert it to a base data.frame
exp_kallisto <- file.path(
  "..", "..", "data", "pbta-gene-expression-kallisto.rds"
) %>%
  readr::read_rds() %>%
  data.frame()

# Load the RSEM (FPKM) expression table and convert it to a base data.frame
exp_rsem <- file.path(
  "..", "..", "data", "pbta-gene-expression-rsem.fpkm.rds"
) %>%
  readr::read_rds() %>%
  data.frame()

Prep Data

# RSEM: move the "gene_id" column into the row names, then drop every row
# that contains a missing value
exp_rsem <- na.omit(tibble::column_to_rownames(exp_rsem, "gene_id"))
# Transpose so the original columns become rows, and keep the resulting row
# names for matching against the metadata
transposed_rsem_data <- t(exp_rsem)
rsem_ID <- rownames(transposed_rsem_data)

# Kallisto: discard the first column, keep only the first row seen for each
# "gene_id", then move "gene_id" into the row names
exp_kallisto <- tibble::column_to_rownames(
  dplyr::filter(exp_kallisto[, -1], !duplicated(gene_id)),
  "gene_id"
)
# Transpose and keep the resulting row names, as for RSEM
transposed_kallisto_data <- t(exp_kallisto)
kallisto_ID <- rownames(transposed_kallisto_data)

RSEM

Run PCA

# Run PCA on RSEM
rsem_pca <- prcomp(transposed_rsem_data)
# Make a data.frame with the scores of the first two principal components
# (prcomp names these columns "PC1" and "PC2")
rsem_pca_data <- data.frame(rsem_pca$x[, 1:2])
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
rsem_pca_data <- reduction_fn(rsem_pca_data, rsem_ID)
# Plot the PCA scores and color by cancer type. The axes are mapped by column
# name instead of indexing the data.frame inside aes(), so ggplot2 looks the
# values up in `data` and labels the axes "PC1" and "PC2".
rsem_pca_plot <- ggplot2::ggplot(
  rsem_pca_data,
  ggplot2::aes(x = PC1, y = PC2, color = type)
) +
  ggplot2::geom_point() +
  ggplot2::theme(legend.position = "none")
# Plot the PCA scores and color by composition
rsem_pca_plot_composition <- ggplot2::ggplot(
  rsem_pca_data,
  ggplot2::aes(x = PC1, y = PC2, color = composition)
) +
  ggplot2::geom_point() +
  ggplot2::theme(legend.position = "none")

Run t-SNE

# Run t-SNE on RSEM. t-SNE is stochastic, so seed R's random number generator
# first to make the embedding reproducible from run to run.
set.seed(2019)
rsem_tsne <- Rtsne::Rtsne(transposed_rsem_data)
# Make a data.frame with t-SNE scores (the unnamed score columns become "X1"
# and "X2")
rsem_tsne_data <- data.frame(rsem_tsne$Y)
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
rsem_tsne_data <- reduction_fn(rsem_tsne_data, rsem_ID)
# Plot the t-SNE scores and color by cancer type. The axes are mapped by
# column name instead of indexing the data.frame inside aes(); explicit labels
# keep the panel identifiable in the combined grid.
rsem_tsne_plot <- ggplot2::ggplot(
  rsem_tsne_data,
  ggplot2::aes(x = X1, y = X2, color = type)
) +
  ggplot2::geom_point() +
  ggplot2::labs(x = "t-SNE 1", y = "t-SNE 2") +
  ggplot2::theme(legend.position = "none")
# Plot the t-SNE scores and color by composition
rsem_tsne_plot_composition <- ggplot2::ggplot(
  rsem_tsne_data,
  ggplot2::aes(x = X1, y = X2, color = composition)
) +
  ggplot2::geom_point() +
  ggplot2::labs(x = "t-SNE 1", y = "t-SNE 2") +
  ggplot2::theme(legend.position = "none")

Run UMAP

# Run UMAP on RSEM. Seed R's random number generator first so repeated runs
# give the same layout.
# NOTE(review): this relies on umap drawing its default random_state from R's
# RNG -- confirm against the installed umap version.
set.seed(2019)
rsem_umap <- umap::umap(transposed_rsem_data)
# Make a data.frame with umap scores (the unnamed layout columns become "X1"
# and "X2")
rsem_umap_data <- data.frame(rsem_umap$layout)
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
rsem_umap_data <- reduction_fn(rsem_umap_data, rsem_ID)

# Plot the umap scores and color by cancer type. Both axes come from the UMAP
# layout; the y axis previously read the second t-SNE column by mistake.
rsem_umap_plot <- ggplot2::ggplot(
  rsem_umap_data,
  ggplot2::aes(x = X1, y = X2, color = type)
) +
  ggplot2::geom_point(size = 1) +
  ggplot2::labs(x = "UMAP 1", y = "UMAP 2") +
  ggplot2::theme(
    legend.position = "bottom",
    legend.text = ggplot2::element_text(size = 3)
  )
# Plot the umap scores and color by composition
rsem_umap_plot_composition <- ggplot2::ggplot(
  rsem_umap_data,
  ggplot2::aes(x = X1, y = X2, color = composition)
) +
  ggplot2::geom_point(size = 1) +
  ggplot2::labs(x = "UMAP 1", y = "UMAP 2") +
  ggplot2::theme(legend.position = "none")

Plots Grid

# Draw the six RSEM plots in a three-column grid, listing the
# composition-colored plots first and the cancer-type-colored plots second
meta_grid_rsem <- gridExtra::grid.arrange(
  grobs = list(
    rsem_pca_plot_composition,
    rsem_tsne_plot_composition,
    rsem_umap_plot_composition,
    rsem_pca_plot,
    rsem_tsne_plot,
    rsem_umap_plot
  ),
  ncol = 3,
  top = "Composition and Cancer Types, respectively (RSEM)"
)

# Write the arranged grid to a PDF inside the plots directory
ggplot2::ggsave(
  filename = file.path("plots", "meta_grid_rsem.pdf"),
  plot = meta_grid_rsem,
  width = 16,
  height = 18
)

Kallisto

Run PCA

# Run PCA on kallisto
kallisto_pca <- prcomp(transposed_kallisto_data)
# Make a data.frame with the scores of the first two principal components
# (prcomp names these columns "PC1" and "PC2")
kallisto_pca_data <- data.frame(kallisto_pca$x[, 1:2])
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
kallisto_pca_data <- reduction_fn(kallisto_pca_data, kallisto_ID)
# Plot the PCA scores and color by cancer type. The axes are mapped by column
# name instead of indexing the data.frame inside aes(), so ggplot2 looks the
# values up in `data` and labels the axes "PC1" and "PC2".
kallisto_pca_plot <- ggplot2::ggplot(
  kallisto_pca_data,
  ggplot2::aes(x = PC1, y = PC2, color = type)
) +
  ggplot2::geom_point() +
  ggplot2::theme(legend.position = "none")
# Plot the PCA scores and color by composition
kallisto_pca_plot_composition <- ggplot2::ggplot(
  kallisto_pca_data,
  ggplot2::aes(x = PC1, y = PC2, color = composition)
) +
  ggplot2::geom_point() +
  ggplot2::theme(legend.position = "none")

Run t-SNE

# Run t-SNE on kallisto. t-SNE is stochastic, so seed R's random number
# generator first to make the embedding reproducible from run to run.
set.seed(2019)
kallisto_tsne <- Rtsne::Rtsne(transposed_kallisto_data)
# Make a data.frame with t-SNE scores (the unnamed score columns become "X1"
# and "X2")
kallisto_tsne_data <- data.frame(kallisto_tsne$Y)
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
kallisto_tsne_data <- reduction_fn(kallisto_tsne_data, kallisto_ID)
# Plot the t-SNE scores and color by cancer type. The axes are mapped by
# column name instead of indexing the data.frame inside aes(); explicit labels
# keep the panel identifiable in the combined grid.
kallisto_tsne_plot <- ggplot2::ggplot(
  kallisto_tsne_data,
  ggplot2::aes(x = X1, y = X2, color = type)
) +
  ggplot2::geom_point() +
  ggplot2::labs(x = "t-SNE 1", y = "t-SNE 2") +
  ggplot2::theme(legend.position = "none")
# Plot the t-SNE scores and color by composition
kallisto_tsne_plot_composition <- ggplot2::ggplot(
  kallisto_tsne_data,
  ggplot2::aes(x = X1, y = X2, color = composition)
) +
  ggplot2::geom_point() +
  ggplot2::labs(x = "t-SNE 1", y = "t-SNE 2") +
  ggplot2::theme(legend.position = "none")

Run UMAP

# Run UMAP on kallisto. Seed R's random number generator first so repeated
# runs give the same layout.
# NOTE(review): this relies on umap drawing its default random_state from R's
# RNG -- confirm against the installed umap version.
set.seed(2019)
kallisto_umap <- umap::umap(transposed_kallisto_data)
# Make a data.frame with umap scores (the unnamed layout columns become "X1"
# and "X2")
kallisto_umap_data <- data.frame(kallisto_umap$layout)
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
kallisto_umap_data <- reduction_fn(kallisto_umap_data, kallisto_ID)
# Plot the umap scores and color by cancer type. The axes are mapped by column
# name instead of indexing the data.frame inside aes(); explicit labels keep
# the panel identifiable in the combined grid.
kallisto_umap_plot <- ggplot2::ggplot(
  kallisto_umap_data,
  ggplot2::aes(x = X1, y = X2, color = type)
) +
  ggplot2::geom_point(size = 1) +
  ggplot2::labs(x = "UMAP 1", y = "UMAP 2") +
  ggplot2::theme(
    legend.position = "bottom",
    legend.text = ggplot2::element_text(size = 4)
  )
# Plot the umap scores and color by composition
kallisto_umap_plot_composition <- ggplot2::ggplot(
  kallisto_umap_data,
  ggplot2::aes(x = X1, y = X2, color = composition)
) +
  ggplot2::geom_point(size = 1) +
  ggplot2::labs(x = "UMAP 1", y = "UMAP 2") +
  ggplot2::theme(
    legend.position = "bottom",
    legend.text = ggplot2::element_text(size = 4)
  )

Plots Grid

# Draw the six kallisto plots in a three-column grid, listing the
# composition-colored plots first and the cancer-type-colored plots second
meta_grid_kallisto <- gridExtra::grid.arrange(
  grobs = list(
    kallisto_pca_plot_composition,
    kallisto_tsne_plot_composition,
    kallisto_umap_plot_composition,
    kallisto_pca_plot,
    kallisto_tsne_plot,
    kallisto_umap_plot
  ),
  ncol = 3,
  top = "Composition and Cancer Types, respectively (kallisto)"
)

# Write the arranged grid to a PDF inside the plots directory
ggplot2::ggsave(
  filename = file.path("plots", "meta_grid_kallisto.pdf"),
  plot = meta_grid_kallisto,
  width = 23,
  height = 19
)